Fig-3: Data Preparation
rm(list=ls(all=TRUE))
pacman::p_load(magrittr,latex2exp,Matrix,dplyr,tidyr,ggplot2,caTools,plotly)
load("data/tf0.rdata")Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387summary(X)## tid date cust age
## Min. : 1 Min. :2000-11-01 Length:88387 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character Class :character
## Median :44194 Median :2000-12-12 Mode :character Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## area items pieces total
## Length:88387 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
## Mode :character Median : 5.000 Median : 6.000 Median : 522.0
## Mean : 6.994 Mean : 9.453 Mean : 888.7
## 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
## Max. :112.000 Max. :339.000 Max. :30171.0
## gross
## Min. :-1645.0
## 1st Qu.: 23.0
## Median : 72.0
## Mean : 138.3
## 3rd Qu.: 174.0
## Max. : 8069.0
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(A)## [1] 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16900A$amountSimply a Left Joint
A = merge(A, feb, by="cust", all.x=T)A$buyA$buy = !is.na(A$amount)
table(A$buy, !is.na(A$amount))##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
summary(A)## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))## [1] 13242 9269 3973
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)save(Z, X, A, spl, spl2, file="data/tf3.rdata")Fig-4: Data Spliting
TR = subset(A, spl)
TS = subset(A, !spl)glm1 = glm(buy ~ ., TR[,c(2:9, 11)], family=binomial())
summary(glm1)##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR[, c(2:9,
## 11)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7931 -0.8733 -0.6991 1.0384 1.8735
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.259e+00 1.261e-01 -9.985 < 2e-16 ***
## r -1.227e-02 8.951e-04 -13.708 < 2e-16 ***
## s 9.566e-03 9.101e-04 10.511 < 2e-16 ***
## f 2.905e-01 1.593e-02 18.233 < 2e-16 ***
## m -3.028e-05 2.777e-05 -1.090 0.27559
## rev 4.086e-05 1.940e-05 2.106 0.03521 *
## raw -2.306e-04 8.561e-05 -2.693 0.00708 **
## agea29 -4.194e-02 8.666e-02 -0.484 0.62838
## agea34 1.772e-02 7.992e-02 0.222 0.82456
## agea39 7.705e-02 7.921e-02 0.973 0.33074
## agea44 8.699e-02 8.132e-02 1.070 0.28476
## agea49 1.928e-02 8.457e-02 0.228 0.81962
## agea54 1.745e-02 9.323e-02 0.187 0.85155
## agea59 1.752e-01 1.094e-01 1.602 0.10926
## agea64 6.177e-02 1.175e-01 0.526 0.59904
## agea69 2.652e-01 1.047e-01 2.533 0.01131 *
## agea99 -1.419e-01 1.498e-01 -0.947 0.34347
## areaz106 -4.105e-02 1.321e-01 -0.311 0.75603
## areaz110 -2.075e-01 1.045e-01 -1.986 0.04703 *
## areaz114 3.801e-02 1.111e-01 0.342 0.73214
## areaz115 2.599e-01 9.682e-02 2.684 0.00727 **
## areaz221 1.817e-01 9.753e-02 1.863 0.06243 .
## areazOthers -4.677e-02 1.045e-01 -0.448 0.65435
## areazUnknown -1.695e-01 1.232e-01 -1.375 0.16912
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23295 on 19984 degrees of freedom
## AIC: 23343
##
## Number of Fisher Scoring iterations: 5
pred = predict(glm1, TS, type="response")
cm = table(actual = TS$buy, predict = pred > 0.5); cm## predict
## actual FALSE TRUE
## FALSE 3730 873
## TRUE 1700 2273
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS$buy) , acc.ts) # 0.69998## [1] 0.5367304 0.6999767
colAUC(pred, TS$buy) # 0.7556## [,1]
## FALSE vs. TRUE 0.7556038
A2 = subset(A, A$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)lm1 = lm(amount ~ ., TR2[,c(2:6,8:10)])
summary(lm1)##
## Call:
## lm(formula = amount ~ ., data = TR2[, c(2:6, 8:10)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02874 -0.23292 0.05011 0.28423 1.45108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.1651034 0.0501997 23.209 < 2e-16 ***
## r 0.0003390 0.0003138 1.080 0.27999
## s 0.0002380 0.0003164 0.752 0.45200
## f 0.0266666 0.0018112 14.723 < 2e-16 ***
## m 0.5165187 0.0375332 13.762 < 2e-16 ***
## rev 0.0240517 0.0363911 0.661 0.50868
## agea29 0.0471837 0.0252728 1.867 0.06194 .
## agea34 0.0896806 0.0233116 3.847 0.00012 ***
## agea39 0.1203331 0.0229212 5.250 1.56e-07 ***
## agea44 0.1107960 0.0235428 4.706 2.56e-06 ***
## agea49 0.0649780 0.0244296 2.660 0.00783 **
## agea54 0.0838574 0.0266168 3.151 0.00163 **
## agea59 0.0395519 0.0310826 1.272 0.20323
## agea64 0.0059463 0.0325943 0.182 0.85525
## agea69 -0.0399961 0.0289299 -1.383 0.16685
## agea99 0.0892782 0.0408041 2.188 0.02870 *
## areaz106 0.0955455 0.0427171 2.237 0.02533 *
## areaz110 0.0526326 0.0347075 1.516 0.12944
## areaz114 0.0154046 0.0364721 0.422 0.67277
## areaz115 0.0193686 0.0317954 0.609 0.54243
## areaz221 0.0350306 0.0320485 1.093 0.27440
## areazOthers 0.0385476 0.0344270 1.120 0.26288
## areazUnknown 0.0130805 0.0387052 0.338 0.73541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4249 on 9246 degrees of freedom
## Multiple R-squared: 0.2796, Adjusted R-squared: 0.2779
## F-statistic: 163.1 on 22 and 9246 DF, p-value: < 2.2e-16
r2.tr = summary(lm1)$r.sq
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm1, TS2) - TS2$amount)^2)
r2.ts = 1 - (SSE/SST)
c(R2train=r2.tr, R2test=r2.ts)## R2train R2test
## 0.2795931 0.2845795
###加入品項數與顧客分群
A00 = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days),s = max(days), f = n(), m = mean(total), rev = sum(total), raw = sum(gross), age = age[1], area = area[1], items = mean(items), period=s-r ) %>% data.frame # 28584
nrow(A00)## [1] 28584
STS = c("N1","R1","R2","S1")
Status = function(rx,fx,sx,dx) {factor(
ifelse(dx == 0,
ifelse(rx <90 , "N1","S1"),
ifelse( dx/fx <10 ,"R2","R1")), STS)}
A1 = A00 %>% mutate(group=Status(r,f,s,period))
N1 = A1 %>% filter(group=="N1")%>%pull(cust)
S1 = A1 %>% filter(group=="S1")%>%pull(cust)
R1 = A1 %>% filter(group=="R1")%>%pull(cust)
R2 = A1 %>% filter(group=="R2")%>%pull(cust)A$amountSimply a Left Joint
A1 = merge(A1, feb, by="cust", all.x=T)A$buyA1$buy = !is.na(A1$amount)
table(A1$buy, !is.na(A1$amount))##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
summary(A1)## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area items period group amount
## Length:28584 Min. : 1.00 Min. : 0.00 N1:11673 Min. : 8
## Class :character 1st Qu.: 3.00 1st Qu.: 0.00 R1:10452 1st Qu.: 454
## Mode :character Median : 6.00 Median :17.00 R2: 6246 Median : 993
## Mean : 7.76 Mean :29.14 S1: 213 Mean : 1499
## 3rd Qu.:10.00 3rd Qu.:59.00 3rd Qu.: 1955
## Max. :64.00 Max. :91.00 Max. :28089
## NA's :15342
## buy
## Mode :logical
## FALSE:15342
## TRUE :13242
##
##
##
##
X1 = subset(X, cust %in% A1$cust & date < as.Date("2001-02-01"))
Z1 = subset(Z, cust %in% A1$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl3 = sample.split(A1$buy, SplitRatio=0.7) #將資料用隨機數選擇後,進行切割
c(nrow(A1), sum(spl3), sum(!spl3))## [1] 28584 20008 8576
A12 = subset(A1, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A12)
set.seed(2018); spl4 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A12), sum(spl4), sum(!spl4))## [1] 13242 9269 3973
TR3 = subset(A1, spl3)
TS3 = subset(A1, !spl3)glm2 = glm(buy ~ ., TR3[,c(2:10,12, 14)], family=binomial())
summary(glm2)##
## Call:
## glm(formula = buy ~ ., family = binomial(), data = TR3[, c(2:10,
## 12, 14)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7678 -0.8627 -0.6774 1.0281 1.9573
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.355e+00 1.278e-01 -10.602 < 2e-16 ***
## r -8.028e-03 1.595e-03 -5.033 4.83e-07 ***
## s 5.137e-03 1.604e-03 3.203 0.00136 **
## f 2.946e-01 1.883e-02 15.646 < 2e-16 ***
## m -7.733e-05 3.397e-05 -2.276 0.02283 *
## rev 4.385e-05 1.945e-05 2.255 0.02415 *
## raw -2.666e-04 8.654e-05 -3.081 0.00207 **
## agea29 -2.916e-02 8.685e-02 -0.336 0.73703
## agea34 2.306e-02 8.009e-02 0.288 0.77344
## agea39 7.815e-02 7.940e-02 0.984 0.32504
## agea44 8.310e-02 8.151e-02 1.020 0.30793
## agea49 1.678e-02 8.475e-02 0.198 0.84301
## agea54 2.221e-02 9.340e-02 0.238 0.81203
## agea59 1.864e-01 1.096e-01 1.700 0.08905 .
## agea64 6.994e-02 1.176e-01 0.595 0.55200
## agea69 2.725e-01 1.049e-01 2.597 0.00939 **
## agea99 -1.301e-01 1.501e-01 -0.867 0.38598
## areaz106 -4.689e-02 1.326e-01 -0.354 0.72354
## areaz110 -2.111e-01 1.049e-01 -2.013 0.04415 *
## areaz114 3.772e-02 1.115e-01 0.338 0.73515
## areaz115 2.577e-01 9.719e-02 2.651 0.00802 **
## areaz221 1.725e-01 9.792e-02 1.761 0.07819 .
## areazOthers -4.527e-02 1.049e-01 -0.432 0.66601
## areazUnknown -1.667e-01 1.235e-01 -1.350 0.17701
## items 1.088e-02 3.660e-03 2.973 0.00294 **
## groupR1 3.274e-01 7.295e-02 4.488 7.18e-06 ***
## groupR2 2.474e-01 5.418e-02 4.567 4.95e-06 ***
## groupS1 -4.725e-03 2.039e-01 -0.023 0.98152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27629 on 20007 degrees of freedom
## Residual deviance: 23258 on 19980 degrees of freedom
## AIC: 23314
##
## Number of Fisher Scoring iterations: 5
pred2 = predict(glm2, TS3, type="response")
cm = table(actual = TS3$buy, predict = pred2 > 0.5); cm## predict
## actual FALSE TRUE
## FALSE 3641 962
## TRUE 1602 2371
acc.ts = cm %>% {sum(diag(.))/sum(.)}
c(1-mean(TS3$buy) , acc.ts) # 0.7010261## [1] 0.5367304 0.7010261
colAUC(pred2, TS3$buy) # 0.7564## [,1]
## FALSE vs. TRUE 0.7563764
tsAUC2 = colAUC(pred2, y=TS3$buy, plotROC=T)A12 = subset(A1, A1$buy) %>% mutate_at(c("m","rev","amount"), log10)
TR4 = subset(A12, spl4)
TS4 = subset(A12, !spl4)lm2 = lm(amount ~ ., TR4[,c(2:6,8:10,12,13)])
summary(lm2)##
## Call:
## lm(formula = amount ~ ., data = TR4[, c(2:6, 8:10, 12, 13)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98081 -0.22905 0.04637 0.28129 1.35949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.365e+00 5.466e-02 24.981 < 2e-16 ***
## r -6.103e-05 4.243e-04 -0.144 0.885626
## s 4.504e-04 4.247e-04 1.060 0.289010
## f 1.921e-02 2.155e-03 8.911 < 2e-16 ***
## m 2.613e-01 5.489e-02 4.761 1.96e-06 ***
## rev 1.950e-01 5.248e-02 3.716 0.000203 ***
## agea29 4.846e-02 2.515e-02 1.927 0.054038 .
## agea34 9.036e-02 2.319e-02 3.896 9.85e-05 ***
## agea39 1.173e-01 2.280e-02 5.144 2.75e-07 ***
## agea44 1.038e-01 2.343e-02 4.431 9.50e-06 ***
## agea49 6.026e-02 2.431e-02 2.479 0.013197 *
## agea54 8.177e-02 2.648e-02 3.088 0.002023 **
## agea59 3.953e-02 3.092e-02 1.278 0.201172
## agea64 1.001e-02 3.244e-02 0.308 0.757716
## agea69 -3.997e-02 2.879e-02 -1.389 0.165002
## agea99 8.905e-02 4.061e-02 2.193 0.028345 *
## areaz106 9.675e-02 4.250e-02 2.277 0.022831 *
## areaz110 5.652e-02 3.453e-02 1.637 0.101757
## areaz114 2.543e-02 3.630e-02 0.701 0.483631
## areaz115 2.340e-02 3.164e-02 0.740 0.459558
## areaz221 3.907e-02 3.189e-02 1.225 0.220456
## areazOthers 4.569e-02 3.426e-02 1.334 0.182299
## areazUnknown 2.253e-02 3.852e-02 0.585 0.558721
## items 8.598e-03 1.068e-03 8.054 9.01e-16 ***
## groupR1 -1.140e-01 2.011e-02 -5.669 1.48e-08 ***
## groupR2 -1.058e-01 2.263e-02 -4.677 2.96e-06 ***
## groupS1 9.557e-02 7.310e-02 1.307 0.191090
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4227 on 9242 degrees of freedom
## Multiple R-squared: 0.2874, Adjusted R-squared: 0.2854
## F-statistic: 143.3 on 26 and 9242 DF, p-value: < 2.2e-16
r2.tr2 = summary(lm2)$r.sq
SST2 = sum((TS4$amount - mean(TR4$amount))^ 2)
SSE2 = sum((predict(lm2, TS4) - TS4$amount)^2)
r2.ts2 = 1 - (SSE/SST)
c(R2train2=r2.tr2, R2test2=r2.ts2)## R2train2 R2test2
## 0.2873732 0.2845795
Fig-3: Prediction
Aggregate data 2000-12-01 ~ 2001~02-28.
load("data/tf0.rdata")
d0 = max(X0$date) + 1
B = X0 %>%
filter(date >= as.Date("2000-12-01")) %>%
mutate(days = as.integer(difftime(d0, date, units="days"))) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
items = mean(items),
period=s-r
) %>% data.frame # 28584
nrow(B)## [1] 28531
STS = c("N1","R1","R2","S1")
Status = function(rx,fx,sx,dx) {factor(
ifelse(dx == 0,
ifelse(rx <90 , "N1","S1"),
ifelse( dx/fx <10 ,"R2","R1")), STS)}
B1 = B %>% mutate(group=Status(r,f,s,period))
N1 = B1 %>% filter(group=="N1")%>%pull(cust)
S1 = B1 %>% filter(group=="S1")%>%pull(cust)
R1 = B1 %>% filter(group=="R1")%>%pull(cust)
R2 = B1 %>% filter(group=="R2")%>%pull(cust)In B, there is a record for each customer.
B$Buy is the probability of buying in March.
B1$Buy = predict(glm1, B1, type="response")💡: 預測購買金額時要記得做指數、對數轉換!
B2 = B1 %>% mutate_at(c("m","rev"), log10)
B1$Rev = 10^predict(lm1, B2)par(mfrow=c(1,2), cex=0.8)
hist(B1$Buy)
hist(log(B1$Rev,10))save(B1, file='data/tf4.rdata')B1$Buy2 = predict(glm2, B1, type="response")B1$Rev2 = 10^predict(lm2, B2)par(mfrow=c(1,2), cex=0.8)
hist(B1$Buy2)
hist(log(B1$Rev2,10))hist(B1$Buy)
hist(log(B1$Rev,10))B1 %>% group_by(group)%>%summarise(buy=mean(Buy2),rev=mean(Rev2)) #各族群的回購機率和預期購買金額## # A tibble: 4 x 3
## group buy rev
## <fct> <dbl> <dbl>
## 1 N1 0.259 912.
## 2 R1 0.592 949.
## 3 R2 0.641 1122.
## 4 S1 0.232 1133.
sum(B1$raw)/sum(B1$rev) #公司目前獲利率## [1] 0.1567946
g = 0.15 # (稅前)獲利率
N = 5 # 期數 = 5
d = 0.1 # 利率 = 10%
B1$CLV = g * B1$Rev2 * rowSums(sapply(
0:N, function(i) (B1$Buy2/(1+d))^i ) )
summary(B1$CLV) #估計顧客終生價值(CLV)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.46 129.35 210.69 304.16 352.63 17338.44
par(mar=c(2,2,3,1), cex=0.8)
hist(log(B1$CLV,10), xlab="", ylab="")# 各族群的平均營收貢獻、保留機率、終生價值
B1 %>% group_by(group) %>% summarise_at(vars(Buy2:CLV), mean)## # A tibble: 4 x 4
## group Buy2 Rev2 CLV
## <fct> <dbl> <dbl> <dbl>
## 1 N1 0.259 912. 178.
## 2 R1 0.592 949. 317.
## 3 R2 0.641 1122. 524.
## 4 S1 0.232 1133. 215.
###繪製顧客終生價值對顧客狀態分群的盒狀圖。
par(mar=c(3,3,4,2), cex=0.8)
boxplot(log(CLV,10)~group, B1, main="CLV by Groups")
估計每位顧客的淨收益 \(\hat{R}(x)\)
m=0.2; b=25; a=40; x=30
DP = function(x,m0,b0,a0) {m0*plogis((10/a0)*(x-b0))}
dp = pmin(1-B1$Buy2, DP(x,m,b,a))
eR = dp*B1$Rev2*g - x
hist(eR,main="預期淨收益分佈",xlab="預期淨收益",ylab="顧客人數")mm=c(0.20, 0.25, 0.15, 0.25)
bb=c( 25, 30, 15, 30)
aa=c( 40, 40, 30, 60)
X = seq(0,60,2)
do.call(rbind, lapply(1:length(mm), function(i) data.frame(
Inst=paste0('Inst',i), Cost=X,
Gain=DP(X,mm[i],bb[i],aa[i])
))) %>% data.frame %>%
ggplot(aes(x=Cost, y=Gain, col=Inst)) +
geom_line(size=1.5,alpha=0.5) + theme_bw() +
ggtitle("Prob. Function: f(x|m,b,a)")X = seq(10, 60, 1)
df = do.call(rbind, lapply(1:length(mm), function(i) {
sapply(X, function(x) {
dp = pmin(1-B1$Buy2, DP(x,mm[i],bb[i],aa[i]))
eR = dp*B1$Rev2*g - x
c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
}))
df %>%
mutate_at(vars(eR.ALL, eR.SEL), function(y) round(y/1000)) %>%
gather('key','value',-i,-x) %>%
mutate(Instrument = paste0('I',i)) %>%
ggplot(aes(x=x, y=value, col=Instrument)) +
geom_hline(yintercept=0, linetype='dashed', col='blue') +
geom_line(size=1.5,alpha=0.5) +
xlab('工具選項(成本)') + ylab('預期收益($K)') +
ggtitle('行銷工具優化','假設行銷工具的效果是其成本的函數') +
facet_wrap(~key,ncol=1,scales='free_y') + theme_bw() -> p
plotly::ggplotly(p)ci = sapply(
list(c("N1"),c("R1"),
c("R2"),c("S1")),
function(v) B1$group %in% v)
X2 = seq(10, 60, 1)
df = do.call(rbind, lapply(1:length(mm), function(i) {
sapply(X2, function(x) {
dp = pmin(1- B1$Buy2[ ci[,i] ] , DP(x,mm[i],bb[i],aa[i]))
eR = dp* B1$Rev2[ ci[,i] ] *g - x
c(i=i, x=x, eR.ALL=sum(eR), N=sum(eR>0), eR.SEL=sum(eR[eR > 0]) )
}) %>% t %>% data.frame
}))
group_by(df, i) %>% top_n(1,eR.SEL)## # A tibble: 4 x 5
## # Groups: i [4]
## i x eR.ALL N eR.SEL
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 35 -114289. 2405 43944.
## 2 2 40 -85275. 2554 41533.
## 3 3 21 -35312. 1341 10911.
## 4 4 44 -573. 30 801.
🚴 討論行銷方案:
如果上述4組工具參數分別是4種行銷工具對4個顧客族群的效果:
■
I1 : N1
■ I2 : R1
■
I3 : R2
■ I4 : S1
針對這4個顧客族群之中選擇行銷對象:
■
選擇行銷對象(N)
N1族群中的2405位顧客;
R1族群中的2554位顧客;
R2族群中的1341位顧客;
S1族群中的30位顧客,總共6330位顧客做行銷。
■ 設定行銷工具的面額(x)
N1族群發送35元價值的折價券;
R1族群發送40元價值的滿額外送免運券;
R2族群發送21元價值的組合包優惠;
S1族群發送44元價值的試吃包。
■ 估計預期報償(eR.SEL)?
N1族群帶來報償為43943.6元;
R1族群帶來報償為41533.04元;
R2族群帶來報償為10911.4元;
S1族群帶來報償為801.18元,總預期報償為97,189.22元。